In [ ]:
# Call form of print: runs on Python 2 and Python 3 alike, whereas the bare
# print statement is a SyntaxError on Python 3.
print("hello")

In [ ]:
# Give x a starting value; later cells read and update it.
x = 3

In [ ]:
# A bare name as the last line of a cell displays its current value.
x

In [ ]:
# Bump x by one. The cell reads its own previous value, so every re-run
# increments x again.
x += 1

In [ ]:
# Dataset location kept in one named constant so it is easy to spot and to
# change when the notebook runs on a machine with a different layout.
BIRDSTRIKES_PATH = "file:///home/bigdata/training/datasets/birdstrikes.csv"
# Load the file as an RDD of text lines (one element per line).
birdstrikes = sc.textFile(BIRDSTRIKES_PATH)

In [ ]:
# Number of elements (text lines) in the RDD.
birdstrikes.count()

In [ ]:
# First element of the RDD.
# NOTE(review): presumably the CSV header line, since a later filter drops the row starting with "id," - confirm.
birdstrikes.first()

In [ ]:
# First two elements of the RDD, returned as a list.
birdstrikes.take(2)

Partitioning


In [ ]:
# How many partitions the RDD is currently split into.
birdstrikes.getNumPartitions()

In [ ]:
# repartition() returns a new RDD; the result is not assigned here, so
# birdstrikes itself keeps its original partitioning.
birdstrikes.repartition(10)

In [ ]:
# Check the partition count of birdstrikes again after the unassigned repartition call.
birdstrikes.getNumPartitions()

Uh oh. (Immutability)


In [ ]:
# Keep the repartitioned RDD this time by binding it to a new name.
rdd = birdstrikes.repartition(10)

In [ ]:
# Partition count of the new RDD.
rdd.getNumPartitions()

In [ ]:
# First element of the repartitioned RDD.
rdd.first()

In [ ]:
# First element of the original RDD, for comparison with rdd.first().
birdstrikes.first()

lambda functions


In [ ]:
def is_not_header(row):
    """Tell data rows from the header: False when row begins with "id,", True otherwise."""
    looks_like_header = row.startswith("id,")
    if not looks_like_header:
        return True
    return False

In [ ]:
# Keep only the rows for which is_not_header returns True.
rows = birdstrikes.filter(is_not_header)

In [ ]:
# Peek at the first two remaining rows.
rows.take(2)

In [ ]:
def is_not_header(row):
    """Compact header test: True unless row begins with "id,"."""
    starts_with_id = row.startswith("id,")
    return not starts_with_id

In [ ]:
# Rebuild rows with the filter, now using the most recent definition of is_not_header.
rows = birdstrikes.filter(is_not_header)

In [ ]:
# Peek at the first five filtered rows.
rows.take(5)

In [ ]:
# The header filter written inline as a lambda instead of a named function.
rows = birdstrikes.filter(lambda row: not row.startswith("id,"))

In [ ]:
# Peek at the first five rows produced by the lambda-based filter.
rows.take(5)

Pipelines


In [ ]:
# The `in` operator tests whether the left string occurs inside the right one.
s = "abc"
"a" in s

In [ ]:
# Membership test for a substring, applied to s.
"d" in s

In [ ]:
# `not in` is the negated membership test.
"d" not in s

In [ ]:
# Chain two filters, then look at the first two surviving rows: keep only
# rows that contain neither "Airplane" nor "Medium".
(rows
 .filter(lambda row: "Airplane" not in row)
 .filter(lambda row: "Medium" not in row)
 .take(2))

In [ ]:
# Exercise: count the number of rows that are not United Airlines and not in Ohio

In [ ]:
# Count rows that contain neither "UNITED AIRLINES" nor "Ohio".
(rows
 .filter(lambda row: "UNITED AIRLINES" not in row)
 .filter(lambda row: "Ohio" not in row)
 .count())

In [ ]:
# Exercise: The number of incidents that are not in Ohio and were caused by a Medium bird

In [ ]:
# Count rows that do not contain "Ohio" but do contain "Medium".
(rows
 .filter(lambda row: "Ohio" not in row)
 .filter(lambda row: "Medium" in row)
 .count())

In [ ]:
# Exercise: The number of incidents that are neither in Ohio nor in California

In [ ]:
# Count rows that contain neither "Ohio" nor "California".
(rows
 .filter(lambda row: "Ohio" not in row)
 .filter(lambda row: "California" not in row)
 .count())

Working with lists


In [ ]:
# A small comma-separated string to experiment with.
string = "1,2,3"

In [ ]:
# split(",") cuts the string at every comma and returns a list of the pieces.
string.split(",")

In [ ]:
# Keep the list of pieces under a name so it can be indexed.
l = string.split(",")

In [ ]:
# First piece (indexing starts at 0).
l[0]

In [ ]:
# Second piece.
l[1]

In [ ]:
# Turn every row string into a list of its comma-separated fields.
# NOTE(review): a plain split(",") ignores CSV quoting, so a quoted field containing a comma would shift the later indices - confirm the dataset has none.
lists = rows.map(lambda row: row.split(","))

In [ ]:
# Build a new list from elements of l in a different order (second, third, first).
[ l[1], l[2], l[0] ]

In [ ]:
# Keep only fields 5 and 9 of every row, as a two-element list.
# NOTE(review): presumably these are the state and cost columns (the names given to toDF later) - confirm against the CSV header.
states = lists.map(lambda item: [ item[5], item[9] ])

In [ ]:
# Exercise: take 5 of those where the state is not Colorado

In [ ]:
# First five pairs whose first field is not "Colorado".
states.filter(lambda row: row[0] != "Colorado").take(5)

In [ ]:
# Exercise: create a new variable cleanstates and put in it those rows whose fields are not empty

In [ ]:
# Drop every pair whose first or second field is an empty string.
cleanstates = (
    states
    .filter(lambda pair: pair[0] != '')
    .filter(lambda pair: pair[1] != '')
)

In [ ]:
# Convert the second field of each pair to int; the first field is kept as is.
finalrdd = cleanstates.map(lambda l: [l[0], int(l[1])])

In [ ]:
# Peek at five converted pairs.
finalrdd.take(5)

In [ ]:
# Sum cost
# RDD.sum() adds the elements up directly and yields 0 for an empty RDD,
# whereas reduce() raises ValueError when there is nothing to reduce.
finalrdd.map(lambda l: l[1]).sum()

In [ ]:
# Build the DataFrame from finalrdd (empty fields removed, cost converted to
# int) rather than from the raw `states` pairs, so the "cost" column is
# numeric and blank states/costs do not leak into the SQL aggregates.
df = finalrdd.toDF(["state","cost"])

In [ ]:
# Convert to a pandas DataFrame and show its first rows.
df.toPandas().head()

In [ ]:
# Register the DataFrame under the name "incidents" so SQL queries can refer to it.
df.registerTempTable("incidents")

In [ ]:
# Up to ten rows of the incidents table, shown as a pandas DataFrame.
sqlContext.sql("SELECT * FROM incidents LIMIT 10").toPandas()

In [ ]:
# Total cost per state, shown as a pandas DataFrame.
# NOTE(review): there is no ORDER BY, so which 10 states LIMIT keeps is not guaranteed to be stable - confirm that is acceptable.
sqlContext.sql("SELECT state, SUM(cost) as total_costs FROM incidents GROUP BY state LIMIT 10").toPandas()

In [ ]:
# Exercise: Select the average (AVG) cost of incidents by bird size

In [ ]: